Data Description

This dataset contains house sale prices for King County, which includes Seattle. It includes homes sold between May 2014 and May 2015.

Imports

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import os
import time

# random state
random_state=100
np.random.seed(random_state) # we need this in each cell
np.random.set_state=random_state

# Jupyter notebook settings for pandas
pd.set_option('display.max_columns', 50)
pd.set_option('display.float_format', '{:,.2g}'.format) # numbers sep by comma
pd.set_option('display.max_rows', 20) # None for all the rows
pd.set_option('display.max_colwidth', 50)

import IPython
from IPython.display import display

print([(x.__name__,x.__version__) for x in [np, pd,sns,matplotlib]])
[('numpy', '1.16.4'), ('pandas', '0.25.0'), ('seaborn', '0.9.0'), ('matplotlib', '3.1.1')]
In [2]:
%load_ext autoreload
%autoreload 2
In [3]:
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
In [4]:
import bokeh
from bokeh.io import output_file, output_notebook
from bokeh.plotting import figure, show, reset_output
from bokeh.models import ColumnDataSource
from bokeh.layouts import row, column, gridplot
from bokeh.models.widgets import Tabs, Panel
from bokeh.palettes import Spectral6
from bokeh.models import ColumnDataSource,FactorRange

# Output the visualization directly in the notebook
output_notebook()

[(x.__name__,x.__version__) for x in [bokeh]]
Loading BokehJS ...
Out[4]:
[('bokeh', '1.3.4')]
In [5]:
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;

Useful Scripts

In [6]:
def show_method_attributes(method, ncols=7):
    """ Show all the attributes of a given method.
    Example:
    ========
    show_method_attributes(list)
     """
    x = [i for i in dir(method) if i[0]!='_']
    x = [i for i in x if i not in 'os np pd sys time psycopg2'.split()]

    return pd.DataFrame(np.array_split(x,ncols)).T.fillna('')

Load the data

In [7]:
df = pd.read_csv('../data/processed/data_cleaned_encoded.csv')
print(df.shape)
df.head().T
(21613, 92)
Out[7]:
0 1 2 3 4
id 7129300520 6414100192 5631500400 2487200875 1954400510
date 2014-10-13 2014-12-09 2015-02-25 2014-12-09 2015-02-18
price 2.2e+05 5.4e+05 1.8e+05 6e+05 5.1e+05
bedrooms 3 3 2 4 3
bathrooms 1 2.2 1 3 2
sqft_living 1180 2570 770 1960 1680
sqft_lot 5650 7242 10000 5000 8080
floors 1 2 1 1 1
waterfront 0 0 0 0 0
view 0 0 0 0 0
... ... ... ... ... ...
age_after_renovation_cat_7 0 0 1 0 0
age_after_renovation_cat_8 0 0 0 0 0
age_after_renovation_cat_9 0 0 0 0 0
log1p_price 12 13 12 13 13
log1p_sqft_living 7.1 7.9 6.6 7.6 7.4
log1p_sqft_lot 8.6 8.9 9.2 8.5 9
log1p_sqft_above 7.1 7.7 6.6 7 7.4
log1p_sqft_basement 0 6 0 6.8 0
log1p_sqft_living15 7.2 7.4 7.9 7.2 7.5
log1p_sqft_lot15 8.6 8.9 9 8.5 8.9

92 rows × 5 columns

Univariate Analysis

Discrete variables and Categorical variables

In [8]:
df['bedrooms'].value_counts()
Out[8]:
3     9824
4     6882
2     2760
5     1601
6      272
1      199
7       38
8       13
0       13
9        6
10       3
11       1
33       1
Name: bedrooms, dtype: int64
In [9]:
from bhishan.util_bokeh import countplot_bokeh
In [10]:
ofile = '../reports/bokeh_outputs/bedrooms_countplot.html'
countplot_bokeh(df, 'bedrooms',height=400,ofile=None)
Loading BokehJS ...
In [11]:
countplot_bokeh(df,'view',height=300)
Loading BokehJS ...

Continuous variables

histograms

In [12]:
from bhishan.util_bokeh import histogram_bokeh

histogram_bokeh(df,'sqft_living',n_bins=20)
Loading BokehJS ...

Bi-variate Analysis

Scatter plots

In [13]:
from bhishan.util_bokeh import scatterplot_bokeh
In [14]:
ofile = '../reports/bokeh_outputs/sqftLiving_vs_price.html'
scatterplot_bokeh(df,'sqft_living','price',ofile=ofile)

Multi-variate Analysis

countplots

In [15]:
from bhishan.util_bokeh import stacked_countplot_bokeh

stacked_countplot_bokeh(df,'bedrooms','yr_sales','price')
Loading BokehJS ...

Map visualization

In [16]:
df[['lat','long']].head()
Out[16]:
lat long
0 48 -1.2e+02
1 48 -1.2e+02
2 48 -1.2e+02
3 48 -1.2e+02
4 48 -1.2e+02
In [17]:
from bhishan.util_bokeh import map_plot_bokeh
In [18]:
ofile = '../reports/bokeh_outputs/map.html'
map_plot_bokeh(df, 'lat', 'long',ofile)